package com.suyunyou.spider.utils;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.suyunyou.spider.data.FetcherAnalysisLinkData;
import com.suyunyou.spider.data.LinkData;
import com.suyunyou.spider.model.Link;
import com.suyunyou.spider.plugins.ILinkPlugin;
import com.system.comm.utils.FrameStringUtil;

/**
 * Utility class that extracts the links contained in a fetched page.
 * @author 岳静
 * @date 2016-06-24 16:12:48
 * @version V1.0
 */
public class AnalysisLinkUtil {

	private static final Logger LOGGER = LoggerFactory.getLogger(AnalysisLinkUtil.class);

	/** Prefix of pseudo-links that carry script instead of an address. */
	private static final String JAVASCRIPT_PREFIX = "javascript:";

	/**
	 * Matches a '#' followed by at least one word character (or '|').
	 * Compiled once here instead of once per link.
	 */
	private static final Pattern FRAGMENT_PATTERN = Pattern.compile("#[\\w|\\d]+", Pattern.MULTILINE);

	/**
	 * Extracts the links found in the content stored for {@code url} and hands
	 * every link that belongs to the same site to {@code LinkData.addFetcherLink}.
	 * <p>
	 * Does nothing when {@code LinkData} has no entry for {@code url}; logs an
	 * error and returns when the stored content is empty.
	 * @param url	source address whose stored content is analysed
	 */
	public static void fetcher(String url) {
		Link l = LinkData.get(url);
		if(l == null) {
			return;
		}
		String content = l.getContent();
		if(FrameStringUtil.isEmpty(content)) {
			// Log the address: the content is known to be empty and identifies nothing.
			LOGGER.error("分析链接的内容为空: {}", url);
			return;
		}
		// Link extraction itself is delegated to the registered link plugins.
		List<String> links = new ArrayList<String>();
		List<ILinkPlugin> linkPlugins = PluginUtil.getLinkPlugins();
		for (ILinkPlugin linkPlugin : linkPlugins) {
			links.addAll(linkPlugin.getLinks(content));
		}
		for (String link : links) {
			if(link == null || link.startsWith(JAVASCRIPT_PREFIX)) {
				// Skip entries that are not real addresses.
				continue;
			}
			link = dealDomain(l, stripFragment(link));
			// Keep only links under the current domain (or the page itself).
			if(!link.startsWith(l.getDomain()) && !link.equals(l.getLink())) {
				continue;
			}
			LinkData.addFetcherLink(link, l.getSiteId());
		}
		// Report the analysed link to FetcherAnalysisLinkData.
		FetcherAnalysisLinkData.update(l.getLink());
	}

	/**
	 * Removes everything from the first '#' onwards when the tail contains a
	 * fragment matched by {@link #FRAGMENT_PATTERN}; otherwise returns the link unchanged.
	 * @param link	link to clean, must not be null
	 * @return the link without its fragment part
	 */
	private static String stripFragment(String link) {
		int hashIndex = link.indexOf('#');
		if(hashIndex != -1 && FRAGMENT_PATTERN.matcher(link.substring(hashIndex)).find()) {
			return link.substring(0, hashIndex);
		}
		return link;
	}

	/**
	 * Turns a relative link into an absolute one; links starting with "http"
	 * are returned as they are.
	 * @param l		the page the link was found on
	 * @param link	the link as it appears in the page
	 * @return the absolute form of the link
	 */
	private static String dealDomain(Link l, String link) {
		if(link.startsWith("http")) {
			return link;
		}
		String domain = SpiderUtil.getDomain(l.getDomain());
		if(link.startsWith("../")) {
			// Drop the parent-directory segments. String.replace is literal;
			// replaceAll would read "../" as a regex and eat any two characters before a '/'.
			String stripped = link.replace("../", "");
			return domain + (stripped.startsWith("/") ? stripped : "/" + stripped);
		}
		if(link.startsWith("/")) {
			// Root-relative: append directly to the domain.
			return domain + link;
		}
		// Relative to the page: keep the page address up to and including its last '/'.
		String pageUrl = l.getLink();
		int schemeEnd = pageUrl.indexOf("://");
		int lastSlash = pageUrl.lastIndexOf('/');
		if(schemeEnd != -1 && lastSlash < schemeEnd + 3) {
			// The only slashes belong to "://" (address has no path), so append after the host.
			return pageUrl + "/" + link;
		}
		return pageUrl.substring(0, lastSlash + 1) + link;
	}

}